From: kaf24@scramble.cl.cam.ac.uk Date: Thu, 22 Apr 2004 13:56:30 +0000 (+0000) Subject: bitkeeper revision 1.872 (4087cf0eay7XY7T1xObNygn1qSwJ0g) X-Git-Tag: archive/raspbian/4.8.0-1+rpi1~1^2~18249 X-Git-Url: https://dgit.raspbian.org/%22http:/www.example.com/cgi/%22https:/%22bookmarks://%22Dat/%22http:/www.example.com/cgi/%22https:/%22bookmarks:/%22Dat?a=commitdiff_plain;h=8babea830827083a13dae8c037dcc210ad33018e;p=xen.git bitkeeper revision 1.872 (4087cf0eay7XY7T1xObNygn1qSwJ0g) Partial checkin of new blkdev backend in Xenolinux. Also updates to the mmu_update interface. --- diff --git a/.rootkeys b/.rootkeys index d27db0000a..b4940a3b04 100644 --- a/.rootkeys +++ b/.rootkeys @@ -671,12 +671,18 @@ 3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c 4083dc16z0jvZEH4PiVDbDRreaNp6w xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/Makefile 4083dc16KQus88a4U3uCV6qVCA6_8Q xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile +4087cf0dPeHOvzmZAazvwLslKEF93A xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h +4087cf0da2cROOiybf9A-j4R_yHnjg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c +4087cf0dvXL1PKX23t_LvO1wVPb7OA xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c +4087cf0dkVF3I19gpT1cNubeJgQr7g xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c +4087cf0dlv1Dw4MAbeRStPPG8IvPPg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c 4075806dI5kfeMD5RV-DA0PYoThx_w xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/Makefile 4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/block.c 4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/block.h 4075806dibjCcfuXv6CINMhxWTw3jQ xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/vbd.c 4083dc16-Kd5y9psK_yk161sme5j5Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/Makefile 4083dc16UmHXxS9g_UFVnkUpN-oP2Q 
xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/Makefile +4087cf0d5dudKw_DecIJgOhLlBF_0Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c 405853f2wg7JXZJNltspMwOZJklxgw xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/frontend/Makefile 405853f6nbeazrNyEWNHBuoSg2PiPA xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/frontend/vnetif.c 3e5a4e65lWzkiPXsZdzPt2RNnJGG1g xenolinux-2.4.26-sparse/arch/xen/kernel/Makefile diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index 6f10afde1a..7f81c924ad 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -165,7 +165,7 @@ static int setup_guestos(int xc_handle, memset(builddomain, 0, sizeof(*builddomain)); - if ( (pm_handle = init_pfn_mapper()) < 0 ) + if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) goto error_out; if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL ) diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c index d06804ed93..3decb28559 100644 --- a/tools/xc/lib/xc_linux_restore.c +++ b/tools/xc/lib/xc_linux_restore.c @@ -186,7 +186,7 @@ int xc_linux_restore(int xc_handle, } shared_info_frame = op.u.getdomaininfo.shared_info_frame; - if ( (pm_handle = init_pfn_mapper()) < 0 ) + if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) goto out; /* Copy saved contents of shared-info page. No checking needed. */ diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c index a702a4a292..dc759f546c 100644 --- a/tools/xc/lib/xc_linux_save.c +++ b/tools/xc/lib/xc_linux_save.c @@ -178,7 +178,7 @@ int xc_linux_save(int xc_handle, goto out; } - if ( (pm_handle = init_pfn_mapper()) < 0 ) + if ( (pm_handle = init_pfn_mapper((domid_t)domid)) < 0 ) goto out; /* Is the suspend-record MFN actually valid for this domain? 
*/ diff --git a/tools/xc/lib/xc_netbsd_build.c b/tools/xc/lib/xc_netbsd_build.c index db5552d26e..8793a512f2 100644 --- a/tools/xc/lib/xc_netbsd_build.c +++ b/tools/xc/lib/xc_netbsd_build.c @@ -80,7 +80,7 @@ static int setup_guestos(int xc_handle, memset(builddomain, 0, sizeof(*builddomain)); - if ( (pm_handle = init_pfn_mapper()) < 0 ) + if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) goto error_out; if ( (page_array = malloc(tot_pages * sizeof(unsigned long))) == NULL ) diff --git a/tools/xc/lib/xc_private.c b/tools/xc/lib/xc_private.c index 3b634148eb..485aa58754 100644 --- a/tools/xc/lib/xc_private.c +++ b/tools/xc/lib/xc_private.c @@ -6,9 +6,15 @@ #include "xc_private.h" -int init_pfn_mapper(void) +int init_pfn_mapper(domid_t domid) { - return open("/dev/mem", O_RDWR); + int fd = open("/dev/mem", O_RDWR); + if ( fd >= 0 ) + { + (void)ioctl(fd, _IO('M', 1), (unsigned long)(domid>> 0)); /* low */ + (void)ioctl(fd, _IO('M', 2), (unsigned long)(domid>>32)); /* high */ + } + return fd; } int close_pfn_mapper(int pm_handle) @@ -49,7 +55,7 @@ static int flush_mmu_updates(int xc_handle, mmu_t *mmu) if ( mmu->idx == FIRST_MMU_UPDATE ) return 0; - /* The first two requests set the correct subject domain. */ + /* The first two requests set the correct subject domain (PTS and GPS). 
*/ mmu->updates[0].val = (unsigned long)(mmu->subject<<16) & ~0xFFFFUL; mmu->updates[0].ptr = (unsigned long)(mmu->subject<< 0) & ~0xFFFFUL; mmu->updates[1].val = (unsigned long)(mmu->subject>>16) & ~0xFFFFUL; @@ -57,7 +63,7 @@ static int flush_mmu_updates(int xc_handle, mmu_t *mmu) mmu->updates[0].ptr |= MMU_EXTENDED_COMMAND; mmu->updates[0].val |= MMUEXT_SET_SUBJECTDOM_L; mmu->updates[1].ptr |= MMU_EXTENDED_COMMAND; - mmu->updates[1].val |= MMUEXT_SET_SUBJECTDOM_H; + mmu->updates[1].val |= MMUEXT_SET_SUBJECTDOM_H | SET_PAGETABLE_SUBJECTDOM; hypercall.op = __HYPERVISOR_mmu_update; hypercall.arg[0] = (unsigned long)mmu->updates; diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h index b6c78b74fd..d4299109e5 100644 --- a/tools/xc/lib/xc_private.h +++ b/tools/xc/lib/xc_private.h @@ -154,7 +154,7 @@ static inline int do_block_io_op(int xc_handle, block_io_op_t *op) /* * PFN mapping. */ -int init_pfn_mapper(void); +int init_pfn_mapper(domid_t domid); int close_pfn_mapper(int pm_handle); void *map_pfn_writeable(int pm_handle, unsigned long pfn); void *map_pfn_readonly(int pm_handle, unsigned long pfn); diff --git a/tools/xend/lib/domain_controller.h b/tools/xend/lib/domain_controller.h index a6ac3b4c92..6a49630113 100644 --- a/tools/xend/lib/domain_controller.h +++ b/tools/xend/lib/domain_controller.h @@ -49,8 +49,116 @@ typedef struct { CONTROL_RING_IDX rx_req_prod, rx_resp_prod; } control_if_t; -#define CMSG_CONSOLE 0 -#define CMSG_CONSOLE_DATA 0 +/* + * Top-level command types. + */ +#define CMSG_CONSOLE 0 /* Console */ +#define CMSG_BLKIF_BE 1 /* Block-device backend */ +#define CMSG_BLKIF_FE 2 /* Block-device frontend */ + +/* + * Subtypes for console messages. + */ +#define CMSG_CONSOLE_DATA 0 + +/* + * Subtypes for block-device messages. + */ +#define CMSG_BLKIF_BE_CREATE 0 /* Create a new block-device interface. */ +#define CMSG_BLKIF_BE_DESTROY 1 /* Destroy a block-device interface. 
*/ +#define CMSG_BLKIF_BE_VBD_CREATE 2 /* Create a new VBD for an interface. */ +#define CMSG_BLKIF_BE_VBD_DESTROY 3 /* Delete a VBD from an interface. */ +#define CMSG_BLKIF_BE_VBD_GROW 4 /* Append an extent to a given VBD. */ +#define CMSG_BLKIF_BE_VBD_SHRINK 5 /* Remove last extent from a given VBD. */ + +/* + * Message request/response definitions for block-device messages. + */ + +typedef u16 blkif_vdev_t; +typedef u16 blkif_pdev_t; +typedef u64 blkif_sector_t; +typedef struct { + blkif_pdev_t device; + blkif_sector_t sector_start; + blkif_sector_t sector_length; +} blkif_extent_t; + +/* Non-specific 'okay' return. */ +#define BLKIF_STATUS_OKAY 0 +/* Non-specific 'error' return. */ +#define BLKIF_STATUS_ERROR 1 +/* The following are specific error returns. */ +#define BLKIF_STATUS_INTERFACE_EXISTS 2 +#define BLKIF_STATUS_INTERFACE_NOT_FOUND 3 + +/* This macro can be used to create an array of descriptive error strings. */ +#define BLKIF_STATUS_ERRORS { \ + "Okay", \ + "Non-specific error", \ + "Interface already exists", \ + "Interface not found" } + +/* CMSG_BLKIF_CREATE */ +typedef struct { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + unsigned int blkif_handle; /* Domain-specific interface handle. */ + unsigned int evtchn_port; /* Event channel for notifications. */ + unsigned long shmem_frame; /* Page cont. shared comms window. */ + /* OUT */ + unsigned int status; +} blkif_create_t; + +/* CMSG_BLKIF_DESTROY */ +typedef struct { + /* IN */ + domid_t domid; /* Identify interface to be destroyed. */ + unsigned int blkif_handle; /* ...ditto... */ + /* OUT */ + unsigned int status; +} blkif_destroy_t; + +/* CMSG_BLKIF_VBD_CREATE */ +typedef struct { + /* IN */ + domid_t domid; /* Identify blkdev interface. */ + unsigned int blkif_handle; /* ...ditto... */ + blkif_vdev_t vdevice; /* Interface-specific id for this VBD. */ + int readonly; /* Non-zero -> VBD isn't writeable. 
*/ + /* OUT */ + unsigned int status; +} blkif_vbd_create_t; + +/* CMSG_BLKIF_VBD_DESTROY */ +typedef struct { + /* IN */ + domid_t domid; /* Identify blkdev interface. */ + unsigned int blkif_handle; /* ...ditto... */ + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ + /* OUT */ + unsigned int status; +} blkif_vbd_destroy_t; + +/* CMSG_BLKIF_VBD_GROW */ +typedef struct { + /* IN */ + domid_t domid; /* Identify blkdev interface. */ + unsigned int blkif_handle; /* ...ditto... */ + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ + blkif_extent_t extent; /* Physical extent to append to VBD. */ + /* OUT */ + unsigned int status; +} blkif_vbd_grow_t; +/* CMSG_BLKIF_VBD_SHRINK */ +typedef struct { + /* IN */ + domid_t domid; /* Identify blkdev interface. */ + unsigned int blkif_handle; /* ...ditto... */ + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ + /* OUT */ + unsigned int status; +} blkif_vbd_shrink_t; #endif /* __DOMAIN_CONTROLLER_H__ */ diff --git a/tools/xend/lib/utils.c b/tools/xend/lib/utils.c index ea2cee05d5..4883ec1a46 100644 --- a/tools/xend/lib/utils.c +++ b/tools/xend/lib/utils.c @@ -674,6 +674,10 @@ static PyObject *xu_port_new(PyObject *self, PyObject *args) goto fail1; } + /* Set the General-Purpose Subject whose page frame will be mapped. 
*/ + (void)ioctl(xup->mem_fd, _IO('M', 1), (unsigned long)(dom>> 0)); /* low */ + (void)ioctl(xup->mem_fd, _IO('M', 2), (unsigned long)(dom>>32)); /* high */ + if ( (xup->xc_handle = xc_interface_open()) == -1 ) { PyErr_SetString(port_error, "Could not open Xen control interface"); diff --git a/xen/common/memory.c b/xen/common/memory.c index 7c94748e07..6d3dc9ead4 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -151,12 +151,10 @@ static int alloc_l2_table(struct pfn_info *page); static int alloc_l1_table(struct pfn_info *page); -static int get_page_from_pagenr(unsigned long page_nr, int check_level); +static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p); static int get_page_and_type_from_pagenr(unsigned long page_nr, u32 type, - int check_level); -#define CHECK_STRICT 0 /* Subject domain must own the page */ -#define CHECK_ANYDOM 1 /* Any domain may own the page (if subject is priv.) */ + struct task_struct *p); static void free_l2_table(struct pfn_info *page); static void free_l1_table(struct pfn_info *page); @@ -180,9 +178,14 @@ static struct { unsigned long deferred_ops; unsigned long cr0; domid_t subject_id; - struct task_struct *subject_p; + /* General-Purpose Subject, Page-Table Subject */ + struct task_struct *gps, *pts; } percpu_info[NR_CPUS] __cacheline_aligned; +/* Determine the current General-Purpose Subject or Page-Table Subject. */ +#define PTS (percpu_info[smp_processor_id()].pts ? : current) +#define GPS (percpu_info[smp_processor_id()].gps ? 
: current) + /* * init_frametable: @@ -295,11 +298,9 @@ int map_ldt_shadow_page(unsigned int off) } -static int get_page_from_pagenr(unsigned long page_nr, int check_level) +static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p) { - struct task_struct *p = current; struct pfn_info *page = &frame_table[page_nr]; - u32 y, x, nx; if ( unlikely(!pfn_is_ram(page_nr)) ) { @@ -307,37 +308,10 @@ static int get_page_from_pagenr(unsigned long page_nr, int check_level) return 0; } - /* Find the correct subject domain. */ - if ( unlikely(percpu_info[p->processor].subject_p != NULL) ) - p = percpu_info[p->processor].subject_p; - - /* Demote ANYDOM to STRICT if subject domain is not privileged. */ - if ( check_level == CHECK_ANYDOM && !IS_PRIV(p) ) - check_level = CHECK_STRICT; - - switch ( check_level ) + if ( unlikely(!get_page(page, p)) ) { - case CHECK_STRICT: - if ( unlikely(!get_page(page, p)) ) - { - MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); - return 0; - } - break; - case CHECK_ANYDOM: - y = page->count_and_flags; - do { - x = y; - nx = x + 1; - if ( unlikely((x & PGC_count_mask) == 0) || - unlikely((nx & PGC_count_mask) == 0) ) - { - MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); - return 0; - } - } - while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) ); - break; + MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); + return 0; } return 1; @@ -346,11 +320,11 @@ static int get_page_from_pagenr(unsigned long page_nr, int check_level) static int get_page_and_type_from_pagenr(unsigned long page_nr, u32 type, - int check_level) + struct task_struct *p) { struct pfn_info *page = &frame_table[page_nr]; - if ( unlikely(!get_page_from_pagenr(page_nr, check_level)) ) + if ( unlikely(!get_page_from_pagenr(page_nr, p)) ) return 0; if ( unlikely(!get_page_type(page, type)) ) @@ -391,8 +365,7 @@ static int get_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn) if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != 
pfn ) { /* Make sure the mapped frame belongs to the correct domain. */ - if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), - CHECK_STRICT)) ) + if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), PTS)) ) return 0; /* @@ -443,14 +416,14 @@ static int get_page_from_l1e(l1_pgentry_t l1e) if ( l1v & _PAGE_RW ) { if ( unlikely(!get_page_and_type_from_pagenr( - pfn, PGT_writeable_page, CHECK_ANYDOM)) ) + pfn, PGT_writeable_page, GPS)) ) return 0; set_bit(_PGC_tlb_flush_on_type_change, &frame_table[pfn].count_and_flags); return 1; } - return get_page_from_pagenr(pfn, CHECK_ANYDOM); + return get_page_from_pagenr(pfn, GPS); } @@ -468,7 +441,7 @@ static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) } if ( unlikely(!get_page_and_type_from_pagenr( - l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, CHECK_STRICT)) ) + l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, PTS)) ) return get_linear_pagetable(l2e, pfn); return 1; @@ -771,12 +744,12 @@ void free_page_type(struct pfn_info *page, unsigned int type) page-frame_table) & PSH_shadowed) ) { /* - * Using 'current->mm' is safe and correct because page-table pages - * are not shared across domains. Updates to such pages' types are - * thus only done within the context of the owning domain. The one - * exception is when destroying a domain; however, this is not a - * problem as the currently-executing domain will not have this - * MFN shadowed, and at domain end-of-day we explicitly unshadow + * Using 'current->mm' is safe and correct because page-table pages + * are not shared across domains. Updates to such pages' types are + * thus only done within the context of the owning domain. The one + * exception is when destroying a domain; however, this is not a + * problem as the currently-executing domain will not have this MFN + * shadowed, and at domain end-of-day we explicitly unshadow * everything so that nothing will get left lying around. 
*/ unshadow_table( page-frame_table, type ); @@ -814,9 +787,9 @@ static int do_extended_command(unsigned long ptr, unsigned long val) case MMUEXT_PIN_L1_TABLE: case MMUEXT_PIN_L2_TABLE: okay = get_page_and_type_from_pagenr( - pfn, (cmd == MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : - PGT_l1_page_table, - CHECK_STRICT); + pfn, + (cmd==MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : PGT_l1_page_table, + PTS); if ( unlikely(!okay) ) { MEM_LOG("Error while pinning pfn %08lx", pfn); @@ -836,7 +809,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) break; case MMUEXT_UNPIN_TABLE: - if ( unlikely(!(okay = get_page_from_pagenr(pfn, CHECK_STRICT))) ) + if ( unlikely(!(okay = get_page_from_pagenr(pfn, PTS))) ) { MEM_LOG("Page %08lx bad domain (dom=%p)", ptr, page->u.domain); @@ -856,8 +829,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) break; case MMUEXT_NEW_BASEPTR: - okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, - CHECK_STRICT); + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, current); if ( likely(okay) ) { invalidate_shadow_ldt(); @@ -890,7 +862,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) break; case MMUEXT_INVLPG: - __flush_tlb_one(val & ~MMUEXT_CMD_MASK); + __flush_tlb_one(ptr); break; case MMUEXT_SET_LDT: @@ -932,11 +904,13 @@ static int do_extended_command(unsigned long ptr, unsigned long val) } else { - if ( percpu_info[cpu].subject_p != NULL ) - put_task_struct(percpu_info[cpu].subject_p); - percpu_info[cpu].subject_p = find_domain_by_id( + if ( percpu_info[cpu].gps != NULL ) + put_task_struct(percpu_info[cpu].gps); + percpu_info[cpu].gps = find_domain_by_id( percpu_info[cpu].subject_id); - if ( percpu_info[cpu].subject_p == NULL ) + percpu_info[cpu].pts = (val & SET_PAGETABLE_SUBJECTDOM) ? 
+ percpu_info[cpu].gps : NULL; + if ( percpu_info[cpu].gps == NULL ) { MEM_LOG("Unknown domain '%llu'", percpu_info[cpu].subject_id); okay = 0; @@ -987,7 +961,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count) * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. */ case MMU_NORMAL_PT_UPDATE: - if ( unlikely(!get_page_from_pagenr(pfn, CHECK_STRICT)) ) + if ( unlikely(!get_page_from_pagenr(pfn, PTS)) ) { MEM_LOG("Could not get page for normal update"); break; @@ -1059,7 +1033,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count) break; case MMU_MACHPHYS_UPDATE: - if ( unlikely(!get_page_from_pagenr(pfn, CHECK_STRICT)) ) + if ( unlikely(!get_page_from_pagenr(pfn, GPS)) ) { MEM_LOG("Could not get page for mach->phys update"); break; @@ -1108,10 +1082,10 @@ int do_mmu_update(mmu_update_t *ureqs, int count) if ( deferred_ops & DOP_RELOAD_LDT ) (void)map_ldt_shadow_page(0); - if ( unlikely(percpu_info[cpu].subject_p != NULL) ) + if ( unlikely(percpu_info[cpu].gps != NULL) ) { - put_task_struct(percpu_info[cpu].subject_p); - percpu_info[cpu].subject_p = NULL; + put_task_struct(percpu_info[cpu].gps); + percpu_info[cpu].gps = percpu_info[cpu].pts = NULL; } return rc; diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h index a196832eb9..8660d86ed5 100644 --- a/xen/include/hypervisor-ifs/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/hypervisor-if.h @@ -71,22 +71,73 @@ #define NR_VIRQS 12 /* - * MMU_XXX: specified in least 2 bits of 'ptr' field. These bits are masked - * off to get the real 'ptr' value. - * All requests specify relevent address in 'ptr'. This is either a - * machine/physical address (MA), or linear/virtual address (VA). - * Normal requests specify update value in 'value'. - * Extended requests specify command in least 8 bits of 'value'. These bits - * are masked off to get the real 'val' value. Except for MMUEXT_SET_LDT - * which shifts the least bits out. 
+ * MMU-UPDATE REQUESTS + * + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. + * ptr[1:0] specifies the appropriate MMU_* command. + * + * GPS (General-Purpose Subject) + * ----------------------------- + * This domain must own all non-page-table pages that are involved in + * MMU updates. By default it is the domain that executes mmu_update(). If the + * caller has sufficient privilege then it can be changed by executing + * MMUEXT_SET_SUBJECTDOM_{L,H}. + * + * PTS (Page-Table Subject) + * ------------------------ + * This domain must own all the page-table pages that are subject to MMU + * updates. By default it is the domain that executes mmu_update(). If the + * caller has sufficient privilege then it can be changed by executing + * MMUEXT_SET_SUBJECTDOM_H with val[14] (SET_PAGETABLE_SUBJECTDOM) set. + * + * ptr[1:0] == MMU_NORMAL_PT_UPDATE: + * Updates an entry in a page table. + * ptr[:2] -- machine address of the page-table entry to modify [1] + * val -- value to write [2] + * + * ptr[1:0] == MMU_MACHPHYS_UPDATE: + * Updates an entry in the machine->pseudo-physical mapping table. + * ptr[:2] -- machine address within the frame whose mapping to modify [3] + * val -- value to write into the mapping entry + * + * ptr[1:0] == MMU_EXTENDED_COMMAND: + * val[7:0] -- MMUEXT_* command + * + * val[7:0] == MMUEXT_(UN)PIN_*_TABLE: + * ptr[:2] -- machine address of frame to be (un)pinned as a p.t. page [1] + * + * val[7:0] == MMUEXT_NEW_BASEPTR: + * ptr[:2] -- machine address of new page-table base to install in MMU [1] + * + * val[7:0] == MMUEXT_TLB_FLUSH: + * no additional arguments + * + * val[7:0] == MMUEXT_INVLPG: + * ptr[:2] -- linear address to be flushed from the TLB + * + * val[7:0] == MMUEXT_SET_LDT: + * ptr[:2] -- linear address of LDT base (NB. 
must be page-aligned) + * val[:8] -- number of entries in LDT + * + * val[7:0] == MMUEXT_SET_SUBJECTDOM_L: + * (ptr[31:15],val[31:15]) -- dom[31:0] + * + * val[7:0] == MMUEXT_SET_SUBJECTDOM_H: + * val[14] -- if TRUE then sets the PTS in addition to the GPS. + * (ptr[31:15],val[31:15]) -- dom[63:32] + * NB. This command must be immediately preceded by SET_SUBJECTDOM_L. + * + * Notes on constraints on the above arguments: + * [1] The page frame containing the machine address must belong to the PTS. + * [2] If the PTE is valid (i.e., bit 0 is set) then the specified page frame + * must belong to: + * (a) the PTS (if the PTE is part of a non-L1 table); or + * (b) the GPS (if the PTE is part of an L1 table). + * [3] The page frame containing the machine address must belong to the GPS. */ -/* A normal page-table update request. */ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ -/* Update an entry in the machine->physical mapping table. */ #define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */ -/* An extended command. */ #define MMU_EXTENDED_COMMAND 3 /* least 8 bits of val demux further */ -/* Extended commands: */ #define MMUEXT_PIN_L1_TABLE 0 /* ptr = MA of frame to pin */ #define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin */ #define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin */ @@ -94,11 +145,12 @@ #define MMUEXT_UNPIN_TABLE 4 /* ptr = MA of frame to unpin */ #define MMUEXT_NEW_BASEPTR 5 /* ptr = MA of new pagetable base */ #define MMUEXT_TLB_FLUSH 6 /* ptr = NULL */ -#define MMUEXT_INVLPG 7 /* ptr = NULL ; val = VA to invalidate */ +#define MMUEXT_INVLPG 7 /* ptr = VA to invalidate */ #define MMUEXT_SET_LDT 8 /* ptr = VA of table; val = # entries */ /* NB. 
MMUEXT_SET_SUBJECTDOM must consist of *_L followed immediately by *_H */ #define MMUEXT_SET_SUBJECTDOM_L 9 /* (ptr[31:15],val[31:15]) = dom[31:0] */ #define MMUEXT_SET_SUBJECTDOM_H 10 /* (ptr[31:15],val[31:15]) = dom[63:32] */ +#define SET_PAGETABLE_SUBJECTDOM (1<<14) /* OR into 'val' arg of SUBJECTDOM_H*/ #define MMUEXT_CMD_MASK 255 #define MMUEXT_CMD_SHIFT 8 diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile index 032d02d7cc..4c8c17367c 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile @@ -1,3 +1,3 @@ O_TARGET := drv.o -obj-y := main.o +obj-y := main.o control.o interface.o vbd.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h new file mode 100644 index 0000000000..865c241f90 --- /dev/null +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h @@ -0,0 +1,94 @@ +/****************************************************************************** + * arch/xen/drivers/vblkif/backend/common.h + */ + +#ifndef __VBLKIF__BACKEND__COMMON_H__ +#define __VBLKIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef NDEBUG +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned long shmem_frame; + unsigned int evtchn; + int irq; + /* Comms information. 
*/ + blk_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ + BLK_RING_IDX blk_req_cons; /* Request consumer. */ + BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */ + /* VBDs attached to this interface. */ + rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ + spinlock_t vbd_lock; /* Protects VBD mapping. */ + /* Private fields. */ + struct list_head blkdev_list; + spinlock_t blk_ring_lock; +} blkif_t; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +void blkif_get(blkif_t *blkif); +void blkif_put(blkif_t *blkif); + +/* An entry in a list of xen_extents. */ +typedef struct _blkif_extent_le { + blkif_extent_t extent; /* an individual extent */ + struct _blkif_extent_le *next; /* and a pointer to the next */ +} blkif_extent_le_t; + +typedef struct _vbd { + blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ + unsigned char mode; /* VBD_MODE_{R,W} */ + unsigned char type; /* XD_TYPE_xxx */ + blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ + rb_node_t rb; /* for linking into R-B tree lookup struct */ +} vbd_t; + +long vbd_create(blkif_vbd_create_t *create_params); +long vbd_grow(blkif_vbd_grow_t *grow_params); +long vbd_shrink(blkif_vbd_shrink_t *shrink_params); +long vbd_destroy(blkif_vbd_destroy_t *delete_params); + +void destroy_all_vbds(struct task_struct *p); + +typedef struct { + blkif_t *blkif; + unsigned long id; + atomic_t pendcnt; + unsigned short operation; + unsigned short status; +} pending_req_t; + +/* Describes a [partial] disk extent (part of a block io request) */ +typedef struct { + unsigned short dev; + unsigned short nr_sects; + unsigned long buffer; + xen_sector_t sector_number; +} phys_seg_t; + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); + +int vblkif_be_controller_init(void); + +void vblkif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __VBLKIF__BACKEND__COMMON_H__ */ diff --git 
a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c new file mode 100644 index 0000000000..a662d9c76e --- /dev/null +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c @@ -0,0 +1,60 @@ +/****************************************************************************** + * arch/xen/drivers/vblkif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_create_t) ) + goto parse_error; + blkif_create((blkif_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_vbd_create_t) ) + goto parse_error; + vbd_create((blkif_vbd_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_DESTROY: + if ( msg->length != sizeof(blkif_vbd_destroy_t) ) + goto parse_error; + vbd_destroy((blkif_vbd_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_GROW: + if ( msg->length != sizeof(blkif_vbd_grow_t) ) + goto parse_error; + vbd_grow((blkif_vbd_grow_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_SHRINK: + if ( msg->length != sizeof(blkif_vbd_shrink_t) ) + goto parse_error; + vbd_shrink((blkif_vbd_shrink_t *)&msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + +int blkif_ctrlif_init(void) +{ + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx); +} diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c 
new file mode 100644 index 0000000000..0a42bc5b87 --- /dev/null +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c @@ -0,0 +1,96 @@ +/****************************************************************************** + * arch/xen/drivers/vblkif/backend/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) \ + (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + (blkif->domid != domid) && + (blkif->handle != handle) ) + blkif = blkif->hash_next; + return blkif; +} + +static void blkif_create(blkif_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + unsigned int evtchn = create->evtchn; + unsigned long shmem_frame = create->shmem_frame; + blkif_t **pblkif, *blkif; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif == NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + goto found_match; + pblkif = &(*pblkif)->hash_next; + } + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->evtchn = evtchn; + blkif->irq = bind_evtchn_to_irq(evtchn); + blkif->shmem_frame = shmem_frame; + blkif->shmem_vbase = ioremap(shmem_frame<vbd_lock); + spin_lock_init(&blkif->blk_ring_lock); + + request_irq(irq, vblkif_be_int, 0, "vblkif-backend", blkif); + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + create->status = BLKIF_STATUS_OKAY; + return; + + found_match: + create->status = BLKIF_STATUS_INTERFACE_EXISTS; + return; + + evtchn_in_use: + unbind_evtchn_from_irq(evtchn); /* drop refcnt */ + create->status = BLKIF_STATUS_ERROR; + return; +} + 
+static void blkif_destroy(blkif_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) == NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + goto found_match; + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_STATUS_NO_INTERFACE; + return; + + found_match: + free_irq(blkif->irq, NULL); + unbind_evtchn_from_irq(blkif->evtchn); + *pblkif = blkif->hash_next; + kmem_cache_free(blkif_cachep, blkif); + destroy->status = BLKIF_STATUS_OKAY; +} + diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c new file mode 100644 index 0000000000..cb44ac173b --- /dev/null +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c @@ -0,0 +1,508 @@ +/****************************************************************************** + * arch/xen/drivers/vblkif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/vblkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. 
+ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ * 
+ * We can't allocate pending_req's in order, since they may complete out of 
+ * order. We therefore maintain an allocation ring. This ring also indicates 
+ * when enough work has been passed down -- at that point the allocation ring 
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+/* Number of pending_reqs currently handed out (i.e. requests in flight). */
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+static kmem_cache_t *buffer_head_cachep;
+
+static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
+
+static int lock_buffer(blkif_t *blkif,
+                       unsigned long buffer,
+                       unsigned short size,
+                       int writeable_buffer);
+static void unlock_buffer(unsigned long buffer,
+                          unsigned short size,
+                          int writeable_buffer);
+
+static void io_schedule(unsigned long unused);
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                 blk_ring_req_entry_t *req);
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, unsigned long st);
+
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head io_schedule_list;
+static spinlock_t io_schedule_list_lock;
+
+/* An interface is on the scheduler list iff its list 'next' link is set. */
+static int __on_blkdev_list(blkif_t *blkif)
+{
+    return blkif->blkdev_list.next != NULL;
+}
+
+/*
+ * Take 'blkif' off the scheduler list, dropping the reference the list held.
+ * The unlocked test is only a fast path; it is repeated under the lock.
+ */
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( !__on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&io_schedule_list_lock, flags);
+    if ( __on_blkdev_list(blkif) )
+    {
+        list_del(&blkif->blkdev_list);
+        blkif->blkdev_list.next = NULL;
+        blkif_put(blkif);
+    }
+    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+/*
+ * Append 'blkif' to the scheduler list (taking a reference on it) unless it
+ * is already queued. The unlocked test is repeated under the lock.
+ */
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( __on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&io_schedule_list_lock, flags);
+    if ( !__on_blkdev_list(blkif) )
+    {
+        list_add_tail(&blkif->blkdev_list, &io_schedule_list);
+        blkif_get(blkif);
+    }
+    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
+
+/*
+ * Tasklet body: service queued interfaces round-robin, at most
+ * BATCH_PER_DOMAIN requests each, until the pending-request pool is exhausted
+ * or the list is empty. An interface with work left over is re-queued at the
+ * tail.
+ */
+static void io_schedule(unsigned long unused)
+{
+    blkif_t          *blkif;
+    struct list_head *ent;
+
+    /* Queue up a batch of requests. */
+    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+            !list_empty(&io_schedule_list) )
+    {
+        ent = io_schedule_list.next;
+        blkif = list_entry(ent, blkif_t, blkdev_list);
+        /* Hold our own reference: removal drops the one held by the list. */
+        blkif_get(blkif);
+        remove_from_blkdev_list(blkif);
+        if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+            add_to_blkdev_list_tail(blkif);
+        blkif_put(blkif);
+    }
+
+    /* Push the batch through to disc. */
+    run_task_queue(&tq_disk);
+}
+
+/*
+ * Kick the scheduler tasklet if fewer than half of the pending_reqs are in
+ * use and at least one interface is waiting for service.
+ */
+static void maybe_trigger_io_schedule(void)
+{
+    /*
+     * Needed so that two processes, who together make the following predicate
+     * true, don't both read stale values and evaluate the predicate
+     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+     */
+    smp_mb();
+
+    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+         !list_empty(&io_schedule_list) )
+        tasklet_schedule(&io_schedule_tasklet);
+}
+
+
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+/*
+ * Per-buffer_head completion. Records a failure against the owning request,
+ * unlocks the buffer and, when the last buffer_head of the request completes,
+ * queues the response and returns the pending_req to the allocation ring.
+ */
+static void end_block_io_op(struct buffer_head *bh, int uptodate)
+{
+    pending_req_t *pending_req = bh->b_private;
+    unsigned long  flags;
+
+    /* An error fails the entire request. */
+    if ( !uptodate )
+    {
+        DPRINTK("Buffer not up-to-date at end of operation\n");
+        pending_req->status = 2;
+    }
+
+    unlock_buffer(virt_to_phys(bh->b_data), 
+                  bh->b_size, 
+                  (pending_req->operation==READ));
+    
+    if ( atomic_dec_and_test(&pending_req->pendcnt) )
+    {
+        make_response(pending_req->blkif, pending_req->id,
+                      pending_req->operation, pending_req->status);
+        blkif_put(pending_req->blkif);
+        /*
+         * A completion callback may run with interrupts enabled as well as
+         * from interrupt context, so interrupts must be off while the
+         * producer lock is held or a nested completion could deadlock on it.
+         */
+        spin_lock_irqsave(&pend_prod_lock, flags);
+        pending_ring[MASK_PEND_IDX(pending_prod)] = 
+            pending_req - pending_reqs;
+        pending_prod++;
+        spin_unlock_irqrestore(&pend_prod_lock, flags);
+        maybe_trigger_io_schedule();
+    }
+}
+
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+/*
+ * Event-channel interrupt handler; 'dev_id' is the interface's blkif_t.
+ * Queues the interface for service and kicks the scheduler if appropriate.
+ */
+void vblkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    blkif_t *blkif = dev_id;
+    add_to_blkdev_list_tail(blkif);
+    maybe_trigger_io_schedule();
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+/*
+ * Placeholder: walks the page frames spanned by [buffer, buffer+size) but
+ * performs no checks yet, so it always reports success (1). The 'fail' path
+ * (returns 0) is not reachable in this version.
+ */
+static int lock_buffer(blkif_t *blkif,
+                       unsigned long buffer,
+                       unsigned short size,
+                       int writeable_buffer)
+{
+    unsigned long    pfn;
+
+    for ( pfn = buffer >> PAGE_SHIFT; 
+          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+          pfn++ )
+    {
+    }
+
+    return 1;
+
+ fail:
+    while ( pfn-- > (buffer >> PAGE_SHIFT) )
+    {
+    }
+    return 0;
+}
+
+/* Placeholder counterpart of lock_buffer(); currently does nothing. */
+static void unlock_buffer(unsigned long buffer,
+                          unsigned short size,
+                          int writeable_buffer)
+{
+    unsigned long pfn;
+
+    for ( pfn = buffer >> PAGE_SHIFT; 
+          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+          pfn++ )
+    {
+    }
+}
+
+/*
+ * Consume up to 'max_to_do' requests from the interface's shared ring.
+ * Returns non-zero if the batch limit or the pending-request pool ran out
+ * before the ring was drained, i.e. the interface needs rescheduling.
+ */
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+    blk_ring_t *blk_ring = blkif->blk_ring_base;
+    blk_ring_req_entry_t *req;
+    BLK_RING_IDX i;
+    int more_to_do = 0;
+
+    /* Take items off the comms ring, taking care not to overflow. */
+    for ( i = blkif->blk_req_cons; 
+          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != 
+                                        BLK_RING_SIZE);
+          i++ )
+    {
+        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+        {
+            more_to_do = 1;
+            break;
+        }
+        
+        /* 'i' is free-running: it must be masked before indexing the ring. */
+        req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
+        switch ( req->operation )
+        {
+        case BLKIF_OP_READ:
+        case BLKIF_OP_WRITE:
+            dispatch_rw_block_io(blkif, req);
+            break;
+
+        default:
+            /* Use 'req' here too; ring[i] would index with the raw value. */
+            DPRINTK("error: unknown block io operation [%d]\n",
+                    req->operation);
+            make_response(blkif, req->id, req->operation, 1);
+            break;
+        }
+    }
+
+    blkif->blk_req_cons = i;
+    return more_to_do;
+}
+
+/*
+ * Validate one read/write request, translate its segments to physical
+ * device extents and submit one buffer_head per physical segment. Any
+ * validation failure produces an immediate error response (status 1).
+ */
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                 blk_ring_req_entry_t *req)
+{
+    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
+    struct buffer_head *bh;
+    /* Same opcode name as the dispatcher switch in do_block_io_op(). */
+    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+    unsigned short nr_sects;
+    unsigned long buffer;
+    int i, tot_sects;
+    pending_req_t *pending_req;
+
+    /* We map virtual scatter/gather segments to physical segments. */
+    int new_segs, nr_psegs = 0;
+    phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
+
+    /* Check that number of segments is sane. */
+    if ( unlikely(req->nr_segments == 0) || 
+         unlikely(req->nr_segments > MAX_BLK_SEGS) )
+    {
+        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+        goto bad_descriptor;
+    }
+
+    /*
+     * Check each address/size pair is sane, and convert into a
+     * physical device and block offset. Note that if the offset and size
+     * crosses a virtual extent boundary, we may end up with more
+     * physical scatter/gather segments than virtual segments.
+     */
+    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
+    {
+        /* Each entry packs the buffer address and a 9-bit sector count. */
+        buffer   = req->buffer_and_sects[i] & ~0x1FF;
+        nr_sects = req->buffer_and_sects[i] &  0x1FF;
+
+        if ( unlikely(nr_sects == 0) )
+        {
+            DPRINTK("zero-sized data request\n");
+            goto bad_descriptor;
+        }
+
+        phys_seg[nr_psegs].dev           = req->device;
+        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
+        phys_seg[nr_psegs].buffer        = buffer;
+        phys_seg[nr_psegs].nr_sects      = nr_sects;
+
+        /* Translate the request into the relevant 'physical device' */
+        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
+        if ( new_segs < 0 )
+        { 
+            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
+                    operation == READ ? "read" : "write", 
+                    req->sector_number + tot_sects, 
+                    req->sector_number + tot_sects + nr_sects, 
+                    req->device); 
+            goto bad_descriptor;
+        }
+  
+        nr_psegs += new_segs;
+        ASSERT(nr_psegs <= MAX_BLK_SEGS*2);
+    }
+
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        if ( unlikely(!lock_buffer(blkif, phys_seg[i].buffer, 
+                                   phys_seg[i].nr_sects << 9,
+                                   operation==READ)) )
+        {
+            DPRINTK("invalid buffer\n");
+            /* Undo the locks taken on the segments before this one. */
+            while ( i-- > 0 )
+                unlock_buffer(phys_seg[i].buffer, 
+                              phys_seg[i].nr_sects << 9,
+                              operation==READ);
+            goto bad_descriptor;
+        }
+    }
+
+    pending_req = &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]];
+    pending_req->blkif     = blkif;
+    pending_req->id        = req->id;
+    pending_req->operation = operation;
+    pending_req->status    = 0;
+    atomic_set(&pending_req->pendcnt, nr_psegs);
+
+    /* Dropped in end_block_io_op() when the last buffer_head completes. */
+    blkif_get(blkif);
+
+    /* Now we pass each segment down to the real blkdev layer. */
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        /* We run from the scheduler tasklet, so we must not sleep here. */
+        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
+        if ( unlikely(bh == NULL) )
+            panic("bh is null\n");
+        memset(bh, 0, sizeof (struct buffer_head));
+    
+        bh->b_size          = phys_seg[i].nr_sects << 9;
+        bh->b_dev           = phys_seg[i].dev;
+        bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
+
+        /* SMH: we store a 'pseudo-virtual' bogus address in b_data since
+           later code will undo this transformation (i.e. +-PAGE_OFFSET). */
+        bh->b_data          = phys_to_virt(phys_seg[i].buffer);
+ 
+        /* SMH: bh_phys() uses the below field as a 'cheap' virt_to_phys */
+        bh->b_page          = &mem_map[phys_seg[i].buffer>>PAGE_SHIFT]; 
+        bh->b_end_io        = end_block_io_op;
+        bh->b_private       = pending_req;
+
+        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock);
+        if ( operation == WRITE )
+            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);
+
+        atomic_set(&bh->b_count, 1);
+
+        /* Dispatch a single request. We'll flush it to disc later. */
+        submit_bh(operation, bh);
+    }
+
+    return;
+
+ bad_descriptor:
+    make_response(blkif, req->id, req->operation, 1);
+} 
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+/*
+ * Write a response for request 'id' onto the interface's shared ring,
+ * publish the new producer index and notify the front end.
+ */
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, unsigned long st)
+{
+    blk_ring_resp_entry_t *resp;
+    unsigned long          flags;
+
+    /*
+     * Place on the response ring for the relevant domain. We are called
+     * both from the scheduler tasklet and from the completion callback, so
+     * interrupts must be disabled while the ring lock is held.
+     */
+    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+    resp = &blkif->blk_ring_base->
+        ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
+    resp->id        = id;
+    resp->operation = op;
+    resp->status    = st;
+    wmb(); /* Response contents must be visible before the index moves. */
+    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
+    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+    /* Kick the relevant domain. */
+    notify_via_evtchn(blkif->evtchn);
+}
+
+/* Debug-VIRQ handler. The statistics dump is currently compiled out. */
+static void blkif_debug_int(int irq, void *unused, struct pt_regs *regs)
+{
+#if 0
+    unsigned long flags;
+    struct task_struct *p;
+    blk_ring_t *blk_ring;
+    int i;
+
+    printk("Dumping block queue stats: nr_pending = %d"
+           " (prod=0x%08x,cons=0x%08x)\n",
+           NR_PENDING_REQS, pending_prod, pending_cons);
+
+    read_lock_irqsave(&tasklist_lock, flags);
+    for_each_domain ( p )
+    {
+        printk("Domain: %llu\n", blkif->domain);
+        blk_ring = blkif->blk_ring_base;
+        printk(" req_prod:0x%08x, req_cons:0x%08x resp_prod:0x%08x/"
+               "0x%08x on_list=%d\n",
+               blk_ring->req_prod, blkif->blk_req_cons,
+               blk_ring->resp_prod, blkif->blk_resp_prod,
+               __on_blkdev_list(p));
+    }
+    read_unlock_irqrestore(&tasklist_lock, flags);
+
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+    {
+        printk("Pend%d: dom=%p, id=%08lx, cnt=%d, op=%d, status=%d\n",
+               i, pending_reqs[i].domain, pending_reqs[i].id,
+               atomic_read(&pending_reqs[i].pendcnt), 
+               pending_reqs[i].operation, pending_reqs[i].status);
+    }
+#endif
+}
+
+/*
+ * Permanently take an interface off the scheduler list. The 'next' link is
+ * set to a non-NULL poison value, so __on_blkdev_list() keeps reporting the
+ * interface as queued and add_to_blkdev_list_tail() will not re-add it.
+ */
+void unlink_blkdev_info(blkif_t *blkif)
+{
+    unsigned long flags;
+
+    spin_lock_irqsave(&io_schedule_list_lock, flags);
+    if ( __on_blkdev_list(blkif) )
+    {
+        list_del(&blkif->blkdev_list);
+        blkif->blkdev_list.next = (void *)0xdeadbeef;
+        blkif_put(blkif);
+    }
+    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+/*
+ * Module initialisation: fill the pending_req allocation ring, set up the
+ * scheduler list, hook the debug VIRQ and create the buffer_head cache.
+ */
+static int __init init_module(void)
+{
+    int i;
+
+    /* All pending_reqs start free: prod is a full ring ahead of cons. */
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    memset(pending_reqs, 0, sizeof(pending_reqs));
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+    
+    for ( i = 0; i < NR_CPUS; i++ )
+        completed_bhs[i] = NULL;
+        
+    spin_lock_init(&io_schedule_list_lock);
+    INIT_LIST_HEAD(&io_schedule_list);
+
+    if ( request_irq(bind_virq_to_irq(VIRQ_DEBUG), blkif_debug_int, 
+                     SA_SHIRQ, "vblkif-backend-dbg", &blkif_debug_int) != 0 )
+        printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
+
+    buffer_head_cachep = kmem_cache_create(
+        "buffer_head_cache", sizeof(struct buffer_head),
+        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+    return 0;
+}
+
+/* Module teardown: nothing is released yet. */
+static void cleanup_module(void)
+{
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c
new file mode 100644
index 0000000000..89acb63363
--- /dev/null
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c
@@ -0,0 +1,701 @@
+/******************************************************************************
+ * arch/xen/drivers/vblkif/backend/vbd.c
+ * 
+ * Routines for managing virtual block devices (VBDs).
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+/*
+ * Create an empty VBD (no extents) with the given device number, mode and
+ * type, and insert it into the per-domain red-black tree keyed on vdevice.
+ * Returns 0 on success, -EINVAL if the vdevice already exists, or -ENOMEM.
+ * NOTE(review): kmalloc(GFP_KERNEL) is called with vbd_lock held -- confirm
+ * that sleeping is acceptable in this context.
+ */
+long __vbd_create(struct task_struct *p,
+                  unsigned short vdevice,
+                  unsigned char mode,
+                  unsigned char type)
+{
+    vbd_t *vbd; 
+    rb_node_t **rb_p, *rb_parent = NULL;
+    long ret = 0;
+
+    spin_lock(&p->vbd_lock);
+
+    /* Descend to the insertion point, rejecting a duplicate vdevice. */
+    rb_p = &p->vbd_rb.rb_node;
+    while ( *rb_p != NULL )
+    {
+        rb_parent = *rb_p;
+        vbd = rb_entry(rb_parent, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+        {
+            rb_p = &rb_parent->rb_left;
+        }
+        else if ( vdevice > vbd->vdevice )
+        {
+            rb_p = &rb_parent->rb_right;
+        }
+        else
+        {
+            DPRINTK("vbd_create attempted for already existing vbd\n");
+            ret = -EINVAL;
+            goto out;
+        }
+    }
+
+    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
+    {
+        DPRINTK("vbd_create: out of memory\n");
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    vbd->vdevice = vdevice; 
+    vbd->mode = mode; 
+    vbd->type = type;
+    vbd->extents = NULL; 
+
+    rb_link_node(&vbd->rb, rb_parent, rb_p);
+    rb_insert_color(&vbd->rb, &p->vbd_rb);
+
+ out:
+    spin_unlock(&p->vbd_lock);
+    return ret; 
+}
+
+
+/*
+ * Privileged entry point: create a virtual disk for create->domain.
+ * Returns -EPERM for unprivileged callers and -EINVAL for an unknown domain;
+ * otherwise the result of __vbd_create().
+ */
+long vbd_create(vbd_create_t *create) 
+{
+    struct task_struct *p;
+    long rc;
+
+    if ( unlikely(!IS_PRIV(current)) )
+        return -EPERM;
+
+    if ( unlikely((p = find_domain_by_id(create->domain)) == NULL) )
+    {
+        DPRINTK("vbd_create attempted for non-existent domain %llu\n", 
+                create->domain); 
+        return -EINVAL; 
+    }
+
+    rc = __vbd_create(p, create->vdevice, create->mode,
+                      XD_TYPE_DISK | XD_FLAG_VIRT);
+
+    put_task_struct(p);
+
+    return rc;
+}
+
+
+/*
+ * Append a copy of 'extent' to the end of the extent list of VBD 'vdevice'.
+ * Returns 0 on success, -EINVAL if the VBD does not exist, or -ENOMEM.
+ * NOTE(review): kmalloc(GFP_KERNEL) is called with vbd_lock held -- confirm.
+ */
+long __vbd_grow(struct task_struct *p,
+                unsigned short vdevice,
+                xen_extent_t *extent)
+{
+    xen_extent_le_t **px, *x; 
+    vbd_t *vbd = NULL;
+    rb_node_t *rb;
+    long ret = 0;
+
+    spin_lock(&p->vbd_lock);
+
+    /* On exit 'vbd' is the last node visited, which may not be a match. */
+    rb = p->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
+    {
+        DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n");
+        ret = -EINVAL;
+        goto out; 
+    } 
+
+    if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL)) == NULL) )
+    {
+        DPRINTK("vbd_grow: out of memory\n");
+        ret = -ENOMEM;
+        goto out;
+    }
+ 
+    x->extent.device = extent->device; 
+    x->extent.start_sector = extent->start_sector; 
+    x->extent.nr_sectors = extent->nr_sectors; 
+    x->next = (xen_extent_le_t *)NULL; 
+
+    /* Advance 'px' to the NULL link that terminates the list. */
+    for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) 
+        continue;
+
+    *px = x;
+
+ out:
+    spin_unlock(&p->vbd_lock);
+    return ret;
+}
+
+
+/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */
+long vbd_grow(vbd_grow_t *grow) 
+{
+    struct task_struct *p;
+    long rc;
+
+    if ( unlikely(!IS_PRIV(current)) )
+        return -EPERM; 
+
+    if ( unlikely((p = find_domain_by_id(grow->domain)) == NULL) )
+    {
+        DPRINTK("vbd_grow: attempted for non-existent domain %llu\n", 
+                grow->domain); 
+        return -EINVAL; 
+    }
+
+    rc = __vbd_grow(p, grow->vdevice, &grow->extent);
+
+    put_task_struct(p);
+
+    return rc;
+}
+
+
+/*
+ * Privileged entry point: remove the last extent of a VBD.
+ * Returns 0 on success, -EPERM for unprivileged callers, or -EINVAL if the
+ * domain or VBD does not exist or the VBD has no extents.
+ */
+long vbd_shrink(vbd_shrink_t *shrink)
+{
+    struct task_struct *p; 
+    xen_extent_le_t **px, *x; 
+    vbd_t *vbd = NULL;
+    rb_node_t *rb;
+    long ret = 0;
+
+    if ( !IS_PRIV(current) )
+        return -EPERM; 
+
+    if ( (p = find_domain_by_id(shrink->domain)) == NULL )
+    {
+        DPRINTK("vbd_shrink attempted for non-existent domain %llu\n", 
+                shrink->domain); 
+        return -EINVAL; 
+    }
+
+    spin_lock(&p->vbd_lock);
+
+    rb = p->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( shrink->vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( shrink->vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(vbd == NULL) || 
+         unlikely(vbd->vdevice != shrink->vdevice) ||
+         unlikely(vbd->extents == NULL) )
+    {
+        DPRINTK("vbd_shrink: attempt to remove non-existent extent.\n");
+        ret = -EINVAL;
+        goto out; 
+    }
+
+    /* Find the last extent. We now know that there is at least one. */
+    for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next )
+        continue;
+
+    /* 'px' is the link that points at the last extent: unlink and free. */
+    x   = *px;
+    *px = x->next;
+    kfree(x);
+
+ out:
+    spin_unlock(&p->vbd_lock);
+    put_task_struct(p);
+    return ret; 
+}
+
+
+/*
+ * Privileged entry point: replace the whole extent list of a VBD with the
+ * caller-supplied array. The old list is only freed once the new one has
+ * been built completely.
+ * Returns 0, -EPERM, -EINVAL (no such domain/VBD), -EFAULT or -ENOMEM.
+ * NOTE(review): copy_from_user() and kmalloc(GFP_KERNEL) run with vbd_lock
+ * held -- confirm that this is safe here.
+ */
+long vbd_setextents(vbd_setextents_t *setextents)
+{
+    struct task_struct *p; 
+    xen_extent_t e;
+    xen_extent_le_t *new_extents, *x, *t; 
+    vbd_t *vbd = NULL;
+    rb_node_t *rb;
+    int i;
+    long ret = 0;
+
+    if ( !IS_PRIV(current) )
+        return -EPERM; 
+
+    if ( (p = find_domain_by_id(setextents->domain)) == NULL )
+    {
+        DPRINTK("vbd_setextents attempted for non-existent domain %llu\n", 
+                setextents->domain); 
+        return -EINVAL; 
+    }
+
+    spin_lock(&p->vbd_lock);
+
+    rb = p->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( setextents->vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( setextents->vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(vbd == NULL) || 
+         unlikely(vbd->vdevice != setextents->vdevice) )
+    {
+        DPRINTK("vbd_setextents: attempt to modify non-existent VBD.\n");
+        ret = -EINVAL;
+        goto out; 
+    }
+
+    /*
+     * Construct the new extent list. The array is walked backwards and each
+     * element is pushed on the head, so the list ends up in array order.
+     */
+    new_extents = NULL;
+    for ( i = setextents->nr_extents - 1; i >= 0; i-- )
+    {
+        if ( unlikely(copy_from_user(&e, 
+                                     &setextents->extents[i], 
+                                     sizeof(e)) != 0) )
+        {
+            DPRINTK("vbd_setextents: copy_from_user failed\n");
+            ret = -EFAULT;
+            goto free_and_out;
+        }
+        
+        if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL))
+                      == NULL) )
+        {
+            DPRINTK("vbd_setextents: out of memory\n");
+            ret = -ENOMEM;
+            goto free_and_out;
+        }
+        
+        x->extent = e;
+        x->next   = new_extents;
+
+        new_extents = x;
+    }
+
+    /* Delete the old extent list _after_ successfully creating the new. */
+    for ( x = vbd->extents; x != NULL; x = t )
+    {
+        t = x->next;
+        kfree(x);
+    }
+
+    /* Make the new list visible. */
+    vbd->extents = new_extents;
+
+ out:
+    spin_unlock(&p->vbd_lock);
+    put_task_struct(p);
+    return ret;
+
+ free_and_out:
+    /* Failed part-way through the new list. Delete all that we managed. */
+    for ( x = new_extents; x != NULL; x = t )
+    {
+        t = x->next;
+        kfree(x);
+    }
+    goto out;
+}
+
+
+/*
+ * Privileged entry point: remove a VBD and free all of its extents.
+ * Returns 0, -EPERM, or -EINVAL if the domain or VBD does not exist.
+ */
+long vbd_delete(vbd_delete_t *delete) 
+{
+    struct task_struct *p; 
+    vbd_t *vbd;
+    rb_node_t *rb;
+    xen_extent_le_t *x, *t;
+
+    if( !IS_PRIV(current) )
+        return -EPERM; 
+
+    if ( (p = find_domain_by_id(delete->domain)) == NULL )
+    {
+        DPRINTK("vbd_delete attempted for non-existent domain %llu\n", 
+                delete->domain); 
+        return -EINVAL; 
+    }
+
+    spin_lock(&p->vbd_lock);
+
+    rb = p->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( delete->vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( delete->vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    DPRINTK("vbd_delete attempted for non-existing VBD.\n");
+
+    spin_unlock(&p->vbd_lock);
+    put_task_struct(p);
+    return -EINVAL;
+
+ found:
+    rb_erase(rb, &p->vbd_rb);
+    /* Save the list head before the VBD itself is freed. */
+    x = vbd->extents;
+    kfree(vbd);
+
+    while ( x != NULL )
+    {
+        t = x->next;
+        kfree(x);
+        x = t;
+    }
+    
+    spin_unlock(&p->vbd_lock);
+    put_task_struct(p);
+    return 0;
+}
+
+
+/* Remove and free every VBD, and all of its extents, belonging to 'p'. */
+void destroy_all_vbds(struct task_struct *p)
+{
+    vbd_t *vbd;
+    rb_node_t *rb;
+    xen_extent_le_t *x, *t;
+
+    spin_lock(&p->vbd_lock);
+
+    /* Repeatedly erase the root node until the tree is empty. */
+    while ( (rb = p->vbd_rb.rb_node) != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+
+        rb_erase(rb, &p->vbd_rb);
+        x = vbd->extents;
+        kfree(vbd);
+        
+        while ( x != NULL )
+        {
+            t = x->next;
+            kfree(x);
+            x = t;
+        }          
+    }
+
+    spin_unlock(&p->vbd_lock);
+}
+
+
+/*
+ * Describe one VBD (device number, type/read-only flags, total capacity in
+ * sectors, owning domain) in the next free slot of the caller's probe buffer.
+ * Returns 0 on success, -ENOMEM if the buffer is full, or -EFAULT.
+ */
+static int vbd_probe_single(xen_disk_info_t *xdi, 
+                            vbd_t *vbd, 
+                            struct task_struct *p)
+{
+    xen_extent_le_t *x; 
+    xen_disk_t cur_disk; 
+
+    if ( xdi->count == xdi->max )
+    {
+        DPRINTK("vbd_probe_devices: out of space for probe.\n"); 
+        return -ENOMEM; 
+    }
+
+    cur_disk.device = vbd->vdevice; 
+    cur_disk.info = vbd->type;
+    if ( !VBD_CAN_WRITE(vbd) )
+        cur_disk.info |= XD_FLAG_RO; 
+    /* Capacity is the sum of the lengths of all extents. */
+    cur_disk.capacity = 0ULL;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+        cur_disk.capacity += x->extent.nr_sectors; 
+    cur_disk.domain = p->domain; 
+        
+    /* Now copy into relevant part of user-space buffer */
+    if( copy_to_user(&xdi->disks[xdi->count], 
+                     &cur_disk, 
+                     sizeof(xen_disk_t)) )
+    { 
+        DPRINTK("vbd_probe_devices: copy_to_user failed\n");
+        return -EFAULT;
+    } 
+        
+    xdi->count++; 
+
+    return 0;
+}
+
+
+/*
+ * Report every VBD of domain 'p' via vbd_probe_single(), visiting the tree
+ * in order (left subtree, node, right subtree) without recursion by following
+ * parent links. Stops at the first error and returns it; 0 on success.
+ * NOTE(review): copy_to_user() is reached with vbd_lock held -- confirm.
+ */
+static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p)
+{
+    int rc = 0;
+    rb_node_t *rb;
+
+    spin_lock(&p->vbd_lock);
+
+    if ( (rb = p->vbd_rb.rb_node) == NULL )
+        goto out;
+
+ new_subtree:
+    /* STEP 1. Find least node (it'll be left-most). */
+    while ( rb->rb_left != NULL )
+        rb = rb->rb_left;
+
+    for ( ; ; )
+    {
+        /* STEP 2. Dealt with left subtree. Now process current node. */
+        if ( (rc = vbd_probe_single(xdi, rb_entry(rb, vbd_t, rb), p)) != 0 )
+            goto out;
+
+        /* STEP 3. Process right subtree, if any. */
+        if ( rb->rb_right != NULL )
+        {
+            rb = rb->rb_right;
+            goto new_subtree;
+        }
+
+        /* STEP 4. Done both subtrees. Head back through ancestors. */
+        for ( ; ; ) 
+        {
+            /* We're done when we get back to the root node. */
+            if ( rb->rb_parent == NULL )
+                goto out;
+            /* If we are left of parent, then parent is next to process. */
+            if ( rb->rb_parent->rb_left == rb )
+                break;
+            /* If we are right of parent, then we climb to grandparent. */
+            rb = rb->rb_parent;
+        }
+
+        rb = rb->rb_parent;
+    }
+
+ out:
+    spin_unlock(&p->vbd_lock);
+    return rc;  
+}
+
+
+/*
+ * Return information about the VBDs available for a given domain, or for all 
+ * domains; in the general case the 'domain' argument will be 0 which means 
+ * "information about the caller"; otherwise the 'domain' argument will 
+ * specify either a given domain, or all domains ("VBD_PROBE_ALL") -- both of 
+ * these cases require the caller to be privileged.
+ */
+long vbd_probe(vbd_probe_t *probe) 
+{
+    struct task_struct *p = NULL; 
+    unsigned long flags;
+    long ret = 0;  
+
+    if ( probe->domain != 0 )
+    { 
+        /* We can only probe for ourselves (unless we're privileged). */
+        if( (probe->domain != current->domain) && !IS_PRIV(current) )
+            return -EPERM; 
+
+        if ( (probe->domain != VBD_PROBE_ALL) &&
+             ((p = find_domain_by_id(probe->domain)) == NULL) )
+        {
+            DPRINTK("vbd_probe attempted for non-existent domain %llu\n", 
+                    probe->domain); 
+            return -EINVAL; 
+        }
+    }
+    else
+    { 
+        /* Default is to probe for ourselves. */
+        p = current; 
+        get_task_struct(p); /* to mirror final put_task_struct */
+    }
+
+    if ( probe->domain == VBD_PROBE_ALL )
+    { 
+        /* NOTE(review): 'p' is reused as the iterator here and is still
+           tested at 'out' -- confirm it is NULL whenever we get there. */
+        read_lock_irqsave(&tasklist_lock, flags);
+        for_each_domain ( p )
+        {
+            if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
+            { 
+                read_unlock_irqrestore(&tasklist_lock, flags);
+                goto out; 
+            }
+        }
+        read_unlock_irqrestore(&tasklist_lock, flags);
+    } 
+    else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
+        goto out; 
+
+ out: 
+    if ( ret != 0 )
+        DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 
+    if ( p != NULL )
+        put_task_struct(p); 
+    return ret; 
+}
+
+
+/*
+ * Report the mode and (up to info->maxextents of) the extents of one VBD.
+ * A caller may query its own domain; any other domain needs privilege.
+ * Returns 0, -EPERM, -EINVAL (no such domain/VBD) or -EFAULT.
+ */
+long vbd_info(vbd_info_t *info) 
+{
+    struct task_struct *p; 
+    xen_extent_le_t *x; 
+    xen_extent_t *extents; 
+    vbd_t *vbd = NULL;
+    rb_node_t *rb;
+    long ret = 0;  
+   
+    if ( (info->domain != current->domain) && !IS_PRIV(current) )
+        return -EPERM; 
+
+    if ( (p = find_domain_by_id(info->domain)) == NULL )
+    {
+        DPRINTK("vbd_info attempted for non-existent domain %llu\n", 
+                info->domain); 
+        return -EINVAL; 
+    }
+
+    spin_lock(&p->vbd_lock);
+
+    rb = p->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( info->vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( info->vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != info->vdevice) )
+    {
+        DPRINTK("vbd_info attempted on non-existent VBD.\n"); 
+        ret = -EINVAL; 
+        goto out; 
+    }
+
+    info->mode = vbd->mode;
+    info->nextents = 0; 
+
+    /* Copy out extents in list order until the caller's buffer is full. */
+    extents = info->extents;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+    {
+        if ( info->nextents == info->maxextents )
+            break;
+        if ( copy_to_user(extents, &x->extent, sizeof(xen_extent_t)) )
+        {
+            DPRINTK("vbd_info: copy_to_user failed\n");
+            ret = -EFAULT;
+            goto out;
+        } 
+        extents++;
+        info->nextents++;
+    }
+
+ out: 
+    spin_unlock(&p->vbd_lock);
+    put_task_struct(p); 
+    return ret; 
+}
+
+
+/*
+ * Translate a segment expressed against a VBD (pseg->dev is the vdevice,
+ * pseg->sector_number the offset within it) into one or two segments on the
+ * backing devices, rewriting pseg[0] and, on overrun, pseg[1] in place.
+ * Returns the number of physical segments produced (1 or 2), -ENODEV if the
+ * VBD does not exist, or -EACCES on a permission or range failure.
+ */
+int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
+{
+    xen_extent_le_t *x; 
+    vbd_t *vbd;
+    rb_node_t *rb;
+    xen_sector_t sec_off;
+    unsigned long nr_secs;
+
+    spin_lock(&p->vbd_lock);
+
+    rb = p->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( pseg->dev < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( pseg->dev > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    DPRINTK("vbd_translate; domain %llu attempted to access "
+            "non-existent VBD.\n", p->domain); 
+
+    spin_unlock(&p->vbd_lock);
+    return -ENODEV; 
+
+ found:
+
+    if ( ((operation == READ) && !VBD_CAN_READ(vbd)) ||
+         ((operation == WRITE) && !VBD_CAN_WRITE(vbd)) )
+    {
+        spin_unlock(&p->vbd_lock);
+        return -EACCES; 
+    }
+
+    /*
+     * Now iterate through the list of xen_extents, working out which should 
+     * be used to perform the translation.
+     */
+    sec_off = pseg->sector_number; 
+    nr_secs = pseg->nr_sects;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+    { 
+        if ( sec_off < x->extent.nr_sectors )
+        {
+            pseg->dev = x->extent.device; 
+            pseg->sector_number = x->extent.start_sector + sec_off;
+            if ( unlikely((sec_off + nr_secs) > x->extent.nr_sectors) )
+                goto overrun;
+            spin_unlock(&p->vbd_lock);
+            return 1;
+        } 
+        /* Not in this extent: make the offset relative to the next one. */
+        sec_off -= x->extent.nr_sectors; 
+    }
+
+    DPRINTK("vbd_translate: end of vbd.\n");
+    spin_unlock(&p->vbd_lock);
+    return -EACCES; 
+
+    /*
+     * Here we deal with overrun onto the following extent. We don't deal with 
+     * overrun of more than one boundary since each request is restricted to 
+     * 2^9 512-byte sectors, so it should be trivial for control software to 
+     * ensure that extents are large enough to prevent excessive overrun.
+     */
+ overrun:
+
+    /* Adjust length of first chunk to run to end of first extent. */
+    pseg[0].nr_sects = x->extent.nr_sectors - sec_off;
+
+    /* Set second chunk buffer and length to start where first chunk ended. */
+    pseg[1].buffer   = pseg[0].buffer + (pseg[0].nr_sects << 9);
+    pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+    /* Now move to the next extent. Check it exists and is long enough! */
+    if ( unlikely((x = x->next) == NULL) || 
+         unlikely(x->extent.nr_sectors < pseg[1].nr_sects) )
+    {
+        DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+        spin_unlock(&p->vbd_lock);
+        return -EACCES;
+    }
+
+    /* Store the real device and start sector for the second chunk. */
+    pseg[1].dev           = x->extent.device;
+    pseg[1].sector_number = x->extent.start_sector;
+    
+    spin_unlock(&p->vbd_lock);
+    return 2;
+}
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c
new file mode 100644
index 0000000000..b0e77ab522
--- /dev/null
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/xen/drivers/vnetif/backend/main.c
+ * 
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. 
A + * reference front-end implementation can be found in: + * arch/xen/drivers/vnetif/frontend + * + * Copyright (c) 2004, K A Fraser + */ + +#include +#include + +static int __init init_module(void) +{ + return 0; +} + +static void cleanup_module(void) +{ +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c index 4002ae4c61..d21b0f90b5 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include static int ctrl_if_evtchn; static int ctrl_if_irq; @@ -50,10 +49,7 @@ static DECLARE_TASKLET(ctrl_if_rx_tasklet, __ctrl_if_rx_tasklet, 0); static void ctrl_if_notify_controller(void) { - evtchn_op_t evtchn_op; - evtchn_op.cmd = EVTCHNOP_send; - evtchn_op.u.send.local_port = ctrl_if_evtchn; - (void)HYPERVISOR_event_channel_op(&evtchn_op); + notify_via_evtchn(ctrl_if_evtchn); } static void ctrl_if_rxmsg_default_handler(ctrl_msg_t *msg, unsigned long id) diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c index b98e0cc6bb..0337cae1ca 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c @@ -321,7 +321,11 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) u.ptr = MMU_EXTENDED_COMMAND; u.ptr |= (unsigned long)&default_ldt[0]; u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT); - HYPERVISOR_mmu_update(&u, 1); + if ( unlikely(HYPERVISOR_mmu_update(&u, 1) < 0) ) + { + show_trace(NULL); + panic("Failed to install default LDT"); + } return; } } diff --git a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c index 39f6863d66..c6dc710576 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c +++ 
b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c @@ -116,7 +116,8 @@ static inline void __flush_page_update_queue(void) #endif idx = 0; wmb(); /* Make sure index is cleared first to avoid double updates. */ - HYPERVISOR_mmu_update(update_queue, _idx); + if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx) < 0) ) + panic("Failed to execute MMU updates"); } void _flush_page_update_queue(void) @@ -182,8 +183,8 @@ void queue_invlpg(unsigned long ptr) unsigned long flags; spin_lock_irqsave(&update_lock, flags); update_queue[idx].ptr = MMU_EXTENDED_COMMAND; - update_queue[idx].val = ptr & PAGE_MASK; - update_queue[idx].val |= MMUEXT_INVLPG; + update_queue[idx].ptr |= ptr & PAGE_MASK; + update_queue[idx].val = MMUEXT_INVLPG; increment_index(); spin_unlock_irqrestore(&update_lock, flags); } diff --git a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c index 7b1162de9c..665357d4bc 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c @@ -31,10 +31,28 @@ static inline void direct_remap_area_pte(pte_t *pte, unsigned long address, unsigned long size, unsigned long machine_addr, - pgprot_t prot) + pgprot_t prot, + domid_t domid) { unsigned long end; + mmu_update_t *u, *v; + u = v = vmalloc(3*PAGE_SIZE); /* plenty */ + + /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). 
*/ + if ( domid != 0 ) + { + v[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL; + v[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL; + v[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL; + v[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL; + v[0].ptr |= MMU_EXTENDED_COMMAND; + v[0].val |= MMUEXT_SET_SUBJECTDOM_L; + v[1].ptr |= MMU_EXTENDED_COMMAND; + v[1].val |= MMUEXT_SET_SUBJECTDOM_H; + v += 2; + } + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -46,11 +64,18 @@ static inline void direct_remap_area_pte(pte_t *pte, printk("direct_remap_area_pte: page already exists\n"); BUG(); } - set_pte(pte, pte_mkio(direct_mk_pte_phys(machine_addr, prot))); + v->ptr = virt_to_machine(pte); + v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO; + v++; address += PAGE_SIZE; machine_addr += PAGE_SIZE; pte++; } while (address && (address < end)); + + if ( ((v-u) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) ) + printk(KERN_WARNING "Failed to ioremap %08lx->%08lx (%08lx)\n", + end-size, end, machine_addr-size); + vfree(u); } static inline int direct_remap_area_pmd(struct mm_struct *mm, @@ -58,7 +83,8 @@ static inline int direct_remap_area_pmd(struct mm_struct *mm, unsigned long address, unsigned long size, unsigned long machine_addr, - pgprot_t prot) + pgprot_t prot, + domid_t domid) { unsigned long end; @@ -74,7 +100,7 @@ static inline int direct_remap_area_pmd(struct mm_struct *mm, if (!pte) return -ENOMEM; direct_remap_area_pte(pte, address, end - address, - address + machine_addr, prot); + address + machine_addr, prot, domid); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -85,7 +111,8 @@ int direct_remap_area_pages(struct mm_struct *mm, unsigned long address, unsigned long machine_addr, unsigned long size, - pgprot_t prot) + pgprot_t prot, + domid_t domid) { int error = 0; pgd_t * dir; @@ -103,7 +130,7 @@ int direct_remap_area_pages(struct mm_struct *mm, if (!pmd) break; error = 
direct_remap_area_pmd(mm, pmd, address, end - address, - machine_addr + address, prot); + machine_addr + address, prot, domid); if (error) break; address = (address + PGDIR_SIZE) & PGDIR_MASK; @@ -158,7 +185,7 @@ void * __ioremap(unsigned long machine_addr, prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | flags); if (direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(addr), - machine_addr, size, prot)) { + machine_addr, size, prot, 0)) { vfree(addr); return NULL; } diff --git a/xenolinux-2.4.26-sparse/drivers/char/mem.c b/xenolinux-2.4.26-sparse/drivers/char/mem.c index dbc10d6382..1d3ec0fe05 100644 --- a/xenolinux-2.4.26-sparse/drivers/char/mem.c +++ b/xenolinux-2.4.26-sparse/drivers/char/mem.c @@ -197,24 +197,11 @@ static inline int noncached_address(unsigned long addr) #endif } +#if !defined(CONFIG_XEN) static int mmap_mem(struct file * file, struct vm_area_struct * vma) { unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; -#if defined(CONFIG_XEN) && defined(CONFIG_XEN_PRIVILEGED_GUEST) - if (!(start_info.flags & SIF_PRIVILEGED)) - return -ENXIO; - - /* DONTCOPY is essential for Xen as copy_page_range is broken. 
*/ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset, - vma->vm_end-vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - return 0; -#elif defined(CONFIG_XEN) - return -ENXIO; -#else /* * Accessing memory above the top the kernel knows about or * through a file pointer that was marked O_SYNC will be @@ -236,8 +223,50 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) vma->vm_page_prot)) return -EAGAIN; return 0; -#endif } +#elif !defined(CONFIG_XEN_PRIVILEGED_GUEST) +static int mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + return -ENXIO; +} +#else +static int mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + domid_t domid; + + if (!(start_info.flags & SIF_PRIVILEGED)) + return -ENXIO; + + domid = file->private_data ? *(domid_t *)file->private_data : 0; + + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset, + vma->vm_end-vma->vm_start, vma->vm_page_prot, + domid)) + return -EAGAIN; + return 0; +} +static int ioctl_mem(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + if (file->private_data == NULL) + file->private_data = kmalloc(sizeof(domid_t), GFP_KERNEL); + switch (cmd) { + case _IO('M', 1): ((unsigned long *)file->private_data)[0]=arg; break; + case _IO('M', 2): ((unsigned long *)file->private_data)[1]=arg; break; + default: return -ENOSYS; + } + return 0; +} +static int release_mem(struct inode * inode, struct file * file) +{ + if (file->private_data != NULL) + kfree(file->private_data); + return 0; +} +#endif /* CONFIG_XEN */ /* * This function reads the *virtual* memory as seen by the kernel. 
@@ -426,10 +455,6 @@ static inline size_t read_zero_pagealigned(char * buf, size_t size) goto out_up; if (vma->vm_flags & VM_SHARED) break; -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - if (vma->vm_flags & VM_IO) - break; -#endif count = vma->vm_end - addr; if (count > size) count = size; @@ -615,10 +640,6 @@ static int mmap_kmem(struct file * file, struct vm_area_struct * vma) unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; -#if defined(CONFIG_XEN) - return -ENXIO; -#endif - /* * If the user is not attempting to mmap a high memory address then * the standard mmap_mem mechanism will work. High memory addresses @@ -663,13 +684,19 @@ static struct file_operations mem_fops = { write: write_mem, mmap: mmap_mem, open: open_mem, +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) + release: release_mem, + ioctl: ioctl_mem, +#endif }; static struct file_operations kmem_fops = { llseek: memory_lseek, read: read_kmem, write: write_kmem, +#if !defined(CONFIG_XEN) mmap: mmap_kmem, +#endif open: open_kmem, }; @@ -715,12 +742,6 @@ static int memory_open(struct inode * inode, struct file * filp) break; #if defined(CONFIG_ISA) || !defined(__mc68000__) case 4: -#if defined(CONFIG_XEN) -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - if (!(start_info.flags & SIF_PRIVILEGED)) -#endif - return -ENXIO; -#endif filp->f_op = &port_fops; break; #endif diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h index 9d12487144..f1d2b77c2e 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h @@ -9,6 +9,7 @@ #ifndef __ASM_XEN__CTRL_IF_H__ #define __ASM_XEN__CTRL_IF_H__ +#include #include typedef control_msg_t ctrl_msg_t; diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/evtchn.h b/xenolinux-2.4.26-sparse/include/asm-xen/evtchn.h index ececad9447..128d766a34 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/evtchn.h +++ 
b/xenolinux-2.4.26-sparse/include/asm-xen/evtchn.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * LOW-LEVEL DEFINITIONS @@ -62,6 +63,14 @@ static inline void clear_evtchn_exception(int port) synch_clear_bit(port, &s->evtchn_exception[0]); } +static inline void notify_via_evtchn(int port) +{ + evtchn_op_t op; + op.cmd = EVTCHNOP_send; + op.u.send.local_port = port; + (void)HYPERVISOR_event_channel_op(&op); +} + /* * CHARACTER-DEVICE DEFINITIONS */ diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h index e20f67e651..c454728c0e 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h @@ -161,13 +161,6 @@ static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count) : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), "b" (req), "c" (count) : "memory" ); - if ( unlikely(ret < 0) ) - { - extern void show_trace(unsigned long *); - show_trace(NULL); - panic("Failed mmu update: %p, %d", req, count); - } - return ret; } diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h b/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h index 308a1b7c40..d853a3f2af 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h @@ -265,10 +265,15 @@ static inline void flush_tlb_pgtables(struct mm_struct *mm, XEN_flush_page_update_queue(); } +/* + * NB. The 'domid' field should be zero if mapping I/O space (non RAM). + * Otherwise it identifies the owner of the memory that is being mapped. + */ extern int direct_remap_area_pages(struct mm_struct *mm, unsigned long address, unsigned long machine_addr, unsigned long size, - pgprot_t prot); + pgprot_t prot, + domid_t domid); #endif /* _I386_PGALLOC_H */